/*
 * Copyright (c) 2018-2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "impl_fp32_fp32.hpp"

namespace depthwise
{

// Pull the neon_convolution_kernels names into this namespace.
using namespace neon_convolution_kernels;
// Depthwise-convolution instantiation whose execute_tile specialisations are
// defined below. The kernels in this file read a 4x4 input patch, apply 3x3
// weights and write a 2x2 output tile of fp32 values.
// NOTE(review): the template arguments presumably mean <output tile rows, output
// tile cols, kernel rows, kernel cols, stride rows, stride cols, input type,
// bias type, output type> - confirm against the DepthwiseConvolution declaration.
using Conv = DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;

#ifdef __aarch64__
/**
 * Compute one 2x2 output tile from a 4x4 input patch with 3x3 weights, for
 * every channel, without applying any activation to the results.
 *
 * The work is done in a single AArch64 inline-assembly block. Channels are
 * processed four at a time with 128-bit (q) loads/stores; the remaining
 * (n_channels & 3) channels are processed one at a time with 32-bit (s)
 * loads/stores.
 *
 * @param n_channels        Number of channels to process.
 * @param weight_bias_ptr   Packed parameters. Each group of four channels reads
 *                          ten 16-byte vectors (160 bytes): the first initialises
 *                          the four accumulators (the bias), the next nine are the
 *                          3x3 weights in row-major order. Each remaining single
 *                          channel reads ten consecutive floats (40 bytes) in the
 *                          same order.
 * @param input             Pointer to the top-left element of the 4x4 input
 *                          patch; consecutive channels are consecutive floats.
 * @param input_row_stride  Distance between input rows, in elements.
 * @param input_col_stride  Distance between input columns, in elements.
 * @param output            Pointer to the top-left element of the 2x2 output
 *                          tile; consecutive channels are consecutive floats.
 * @param output_row_stride Distance between output rows, in elements.
 * @param output_col_stride Distance between output columns, in elements.
 *
 * The pointer parameters are local copies that the assembly advances as it
 * walks the channels; the caller's pointers are not modified.
 */
template <>
template <>
void Conv::execute_tile<ActivationFunction::None>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    // Address setup (all strides reach the assembly already scaled to bytes):
    //   x26, x27, x28 = pointers to input rows 1, 2, 3 (row 0 is inptr0)
    //   x21, x22      = byte offsets of input columns 2 and 3
    //   x23           = pointer to output row 1 (row 0 is outptr0)
    //   x25           = n_channels / 4 (vector iterations)
    //   x24           = n_channels % 4 (scalar iterations)
    "add x26, %[inptr0], %[input_row_stride]\n"
    "add x21, %[input_col_stride1], %[input_col_stride1]\n"
    "add x23, %[outptr0], %[output_row_stride]\n"
    "add x27, x26, %[input_row_stride]\n"
    "add x22, x21, %[input_col_stride1]\n"
    "and x24, %[n_channels], #3\n"
    "add x28, x27, %[input_row_stride]\n"
    "lsr x25, %[n_channels], #2\n"
    // No complete group of four channels: go straight to the scalar path.
    "cbz x25, 4f\n"
    // 1: prologue of the four-channel path. v15 holds the bias and seeds the
    // accumulators v3 (out[0][0]), v2 (out[0][1]), v1 (out[1][0]) and
    // v0 (out[1][1]); v14..v6 hold the nine weights in row-major order.
    // The flags set by this subs are consumed by the "beq 3f" below; none of
    // the instructions in between modify them.
    "1:\n"
    "ldr q15, [%[wbptr]]\n"
    "subs x25, x25, #1\n"
    "mov v3.16b, v15.16b\n"
    "ldr q14, [%[wbptr], #16]\n"
    "mov v1.16b, v15.16b\n"
    "ldr q13, [%[wbptr], #32]\n"
    "mov v2.16b, v15.16b\n"
    "ldr q12, [%[wbptr], #48]\n"
    "mov v0.16b, v15.16b\n"
    "ldr q11, [%[wbptr], #64]\n"
    "ldr q10, [%[wbptr], #80]\n"
    "ldr q9, [%[wbptr], #96]\n"
    "ldr q8, [%[wbptr], #112]\n"
    "ldr q7, [%[wbptr], #128]\n"
    "ldr q6, [%[wbptr], #144]\n"
    // Load the first eight input elements and start accumulating.
    "ldr q24, [%[inptr0]]\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "ldr q22, [x26]\n"
    "fmla v1.4s, v22.4s, v14.4s\n"
    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v19.4s, v14.4s\n"
    "ldr q18, [x27]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "ldr q21, [x26, %[input_col_stride1]]\n"
    "fmla v1.4s, v18.4s, v11.4s\n"
    "ldr q17, [%[inptr0], x21]\n"
    "ldr q20, [x28]\n"
    "ldr q5, [x27, %[input_col_stride1]]\n"
    "fmla v3.4s, v19.4s, v13.4s\n"
    "fmla v3.4s, v18.4s, v8.4s\n"
    // Only one group of four channels: skip the pipelined loop.
    "beq 3f\n"
    // 2: steady-state loop. Finishes the current group (remaining eight input
    // loads, remaining multiply-accumulates, four stores) while loading the
    // parameters and first inputs of the next group and starting its
    // accumulation. All pointers advance by 16 bytes (four floats).
    "2:\n"
    "fmla v3.4s, v21.4s, v10.4s\n"
    "ldr q19, [x26, x21]\n"
    "fmla v1.4s, v21.4s, v13.4s\n"
    "ldr q23, [%[inptr0], x22]\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "ldr q22, [x28, %[input_col_stride1]]\n"
    "fmla v0.4s, v21.4s, v14.4s\n"
    "ldr q21, [x27, x21]\n"
    "fmla v3.4s, v17.4s, v12.4s\n"
    "ldr q18, [x26, x22]\n"
    "fmla v2.4s, v17.4s, v13.4s\n"
    "ldr q16, [x28, x21]\n"
    "fmla v1.4s, v20.4s, v8.4s\n"
    "ldr q20, [x27, x22]\n"
    "fmla v3.4s, v5.4s, v7.4s\n"
    "ldr q4, [x28, x22]\n"
    "fmla v2.4s, v5.4s, v8.4s\n"
    // Step to the next group's parameters (10 vectors * 16 bytes).
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v1.4s, v5.4s, v10.4s\n"
    "ldr q15, [%[wbptr]]\n"
    "fmla v0.4s, v5.4s, v11.4s\n"
    "ldr q14, [%[wbptr], #16]\n"
    "fmla v3.4s, v19.4s, v9.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v1.4s, v19.4s, v12.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v2.4s, v19.4s, v10.4s\n"
    "ldr q11, [%[wbptr], #64]\n"
    "fmla v0.4s, v19.4s, v13.4s\n"
    "ldr q24, [%[inptr0]]\n"
    "fmla v1.4s, v22.4s, v7.4s\n"
    "ldr q19, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "ldr q17, [%[inptr0], x21]\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "ldr q13, [%[wbptr], #32]\n"
    "fmla v3.4s, v21.4s, v6.4s\n"
    "add x26, x26, #16\n"
    "fmla v1.4s, v21.4s, v9.4s\n"
    "ldr q22, [x26]\n"
    "fmla v2.4s, v21.4s, v7.4s\n"
    "ldr q8, [%[wbptr], #112]\n"
    // out[0][0] is complete.
    "str q3, [%[outptr0]]\n"
    "fmla v0.4s, v21.4s, v10.4s\n"
    "fmla v1.4s, v16.4s, v6.4s\n"
    "ldr q21, [x26, %[input_col_stride1]]\n"
    "fmla v2.4s, v18.4s, v9.4s\n"
    "add x27, x27, #16\n"
    "fmla v0.4s, v18.4s, v12.4s\n"
    "ldr q10, [%[wbptr], #80]\n"
    // out[1][0] is complete; v3 is re-seeded with the next group's bias.
    "str q1, [x23]\n"
    "mov v3.16b, v15.16b\n"
    "fmla v2.4s, v20.4s, v6.4s\n"
    "ldr q18, [x27]\n"
    "fmla v0.4s, v16.4s, v7.4s\n"
    "ldr q12, [%[wbptr], #48]\n"
    "mov v1.16b, v15.16b\n"
    "ldr q5, [x27, %[input_col_stride1]]\n"
    // out[0][1] is complete.
    "str q2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "fmla v0.4s, v20.4s, v9.4s\n"
    "ldr q7, [%[wbptr], #128]\n"
    "mov v2.16b, v15.16b\n"
    "add x28, x28, #16\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "ldr q20, [x28]\n"
    "fmla v0.4s, v4.4s, v6.4s\n"
    "ldr q9, [%[wbptr], #96]\n"
    "fmla v1.4s, v22.4s, v14.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v3.4s, v19.4s, v13.4s\n"
    // Loop counter update; the flags are consumed by "bne 2b" below.
    "subs x25, x25, #1\n"
    // out[1][1] is complete.
    "str q0, [x23, %[output_col_stride1]]\n"
    "fmla v2.4s, v19.4s, v14.4s\n"
    "ldr q6, [%[wbptr], #144]\n"
    "add x23, x23, #16\n"
    "fmla v3.4s, v18.4s, v8.4s\n"
    "fmla v1.4s, v18.4s, v11.4s\n"
    "mov v0.16b, v15.16b\n"
    "bne 2b\n"
    // 3: tail of the four-channel path. Finishes the last group exactly as the
    // loop does, but loads no parameters or inputs for a following group.
    "3:\n"
    "fmla v3.4s, v21.4s, v10.4s\n"
    "ldr q19, [x26, x21]\n"
    "fmla v1.4s, v21.4s, v13.4s\n"
    "ldr q23, [%[inptr0], x22]\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "ldr q22, [x28, %[input_col_stride1]]\n"
    "fmla v0.4s, v21.4s, v14.4s\n"
    "ldr q21, [x27, x21]\n"
    "fmla v3.4s, v17.4s, v12.4s\n"
    "ldr q18, [x26, x22]\n"
    "fmla v2.4s, v17.4s, v13.4s\n"
    "ldr q16, [x28, x21]\n"
    "fmla v1.4s, v20.4s, v8.4s\n"
    "ldr q20, [x27, x22]\n"
    "fmla v3.4s, v5.4s, v7.4s\n"
    "ldr q4, [x28, x22]\n"
    "fmla v2.4s, v5.4s, v8.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v1.4s, v5.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v0.4s, v5.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v3.4s, v19.4s, v9.4s\n"
    "add x26, x26, #16\n"
    "fmla v1.4s, v19.4s, v12.4s\n"
    "add x27, x27, #16\n"
    "fmla v2.4s, v19.4s, v10.4s\n"
    "add x28, x28, #16\n"
    "fmla v0.4s, v19.4s, v13.4s\n"
    "fmla v3.4s, v21.4s, v6.4s\n"
    "fmla v1.4s, v22.4s, v7.4s\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "str q3, [%[outptr0]]\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "fmla v1.4s, v21.4s, v9.4s\n"
    "fmla v2.4s, v21.4s, v7.4s\n"
    "fmla v0.4s, v21.4s, v10.4s\n"
    "fmla v1.4s, v16.4s, v6.4s\n"
    "fmla v2.4s, v18.4s, v9.4s\n"
    "fmla v0.4s, v18.4s, v12.4s\n"
    "str q1, [x23]\n"
    "fmla v2.4s, v20.4s, v6.4s\n"
    "fmla v0.4s, v16.4s, v7.4s\n"
    "str q2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v20.4s, v9.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v0.4s, v4.4s, v6.4s\n"
    "str q0, [x23, %[output_col_stride1]]\n"
    "add x23, x23, #16\n"
    // 4: scalar path for the remaining (n_channels & 3) channels. Same
    // schedule as the four-channel path, but with 32-bit loads and stores,
    // 40 bytes of parameters per channel and 4-byte pointer steps. The fmla
    // instructions still operate on full vectors; only lane 0 is stored.
    "4:\n"
    "cbz x24, 7f\n"
    "ldr s15, [%[wbptr]]\n"
    "mov v3.16b, v15.16b\n"
    "ldr s14, [%[wbptr], #4]\n"
    "mov v1.16b, v15.16b\n"
    "ldr s13, [%[wbptr], #8]\n"
    "mov v2.16b, v15.16b\n"
    "ldr s12, [%[wbptr], #12]\n"
    "mov v0.16b, v15.16b\n"
    "ldr s11, [%[wbptr], #16]\n"
    "ldr s10, [%[wbptr], #20]\n"
    // Flags from this subs are consumed by "beq 6f" below.
    "subs x24, x24, #1\n"
    "ldr s9, [%[wbptr], #24]\n"
    "ldr s8, [%[wbptr], #28]\n"
    "ldr s7, [%[wbptr], #32]\n"
    "ldr s6, [%[wbptr], #36]\n"
    "ldr s24, [%[inptr0]]\n"
    "ldr s22, [x26]\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v1.4s, v22.4s, v14.4s\n"
    "ldr s18, [x27]\n"
    "fmla v2.4s, v19.4s, v14.4s\n"
    "ldr s21, [x26, %[input_col_stride1]]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "ldr s17, [%[inptr0], x21]\n"
    "fmla v1.4s, v18.4s, v11.4s\n"
    "ldr s20, [x28]\n"
    "ldr s5, [x27, %[input_col_stride1]]\n"
    "fmla v3.4s, v19.4s, v13.4s\n"
    "fmla v3.4s, v18.4s, v8.4s\n"
    "beq 6f\n"
    // 5: scalar steady-state loop (counterpart of label 2).
    "5:\n"
    "fmla v3.4s, v21.4s, v10.4s\n"
    "ldr s19, [x26, x21]\n"
    "fmla v1.4s, v21.4s, v13.4s\n"
    "ldr s23, [%[inptr0], x22]\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "ldr s22, [x28, %[input_col_stride1]]\n"
    "fmla v0.4s, v21.4s, v14.4s\n"
    "ldr s21, [x27, x21]\n"
    "fmla v3.4s, v17.4s, v12.4s\n"
    "ldr s18, [x26, x22]\n"
    "fmla v2.4s, v17.4s, v13.4s\n"
    "ldr s16, [x28, x21]\n"
    "fmla v1.4s, v20.4s, v8.4s\n"
    "ldr s20, [x27, x22]\n"
    "fmla v3.4s, v5.4s, v7.4s\n"
    "ldr s4, [x28, x22]\n"
    "fmla v2.4s, v5.4s, v8.4s\n"
    // Step to the next channel's parameters (10 floats * 4 bytes).
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v1.4s, v5.4s, v10.4s\n"
    "ldr s15, [%[wbptr]]\n"
    "fmla v0.4s, v5.4s, v11.4s\n"
    "ldr s14, [%[wbptr], #4]\n"
    "fmla v3.4s, v19.4s, v9.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v1.4s, v19.4s, v12.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v2.4s, v19.4s, v10.4s\n"
    "ldr s11, [%[wbptr], #16]\n"
    "fmla v0.4s, v19.4s, v13.4s\n"
    "ldr s24, [%[inptr0]]\n"
    "fmla v1.4s, v22.4s, v7.4s\n"
    "ldr s19, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "ldr s17, [%[inptr0], x21]\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "ldr s13, [%[wbptr], #8]\n"
    "fmla v3.4s, v21.4s, v6.4s\n"
    "add x26, x26, #4\n"
    "fmla v1.4s, v21.4s, v9.4s\n"
    "ldr s22, [x26]\n"
    "fmla v2.4s, v21.4s, v7.4s\n"
    "ldr s8, [%[wbptr], #28]\n"
    "str s3, [%[outptr0]]\n"
    "fmla v0.4s, v21.4s, v10.4s\n"
    "fmla v1.4s, v16.4s, v6.4s\n"
    "ldr s21, [x26, %[input_col_stride1]]\n"
    "fmla v2.4s, v18.4s, v9.4s\n"
    "add x27, x27, #4\n"
    "fmla v0.4s, v18.4s, v12.4s\n"
    "ldr s10, [%[wbptr], #20]\n"
    "str s1, [x23]\n"
    "mov v3.16b, v15.16b\n"
    "fmla v2.4s, v20.4s, v6.4s\n"
    "ldr s18, [x27]\n"
    "fmla v0.4s, v16.4s, v7.4s\n"
    "ldr s12, [%[wbptr], #12]\n"
    "mov v1.16b, v15.16b\n"
    "ldr s5, [x27, %[input_col_stride1]]\n"
    "str s2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v3.4s, v24.4s, v14.4s\n"
    "fmla v0.4s, v20.4s, v9.4s\n"
    "ldr s7, [%[wbptr], #32]\n"
    "mov v2.16b, v15.16b\n"
    "add x28, x28, #4\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "ldr s20, [x28]\n"
    "fmla v0.4s, v4.4s, v6.4s\n"
    "ldr s9, [%[wbptr], #24]\n"
    "fmla v1.4s, v22.4s, v14.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v3.4s, v19.4s, v13.4s\n"
    "subs x24, x24, #1\n"
    "str s0, [x23, %[output_col_stride1]]\n"
    "fmla v2.4s, v19.4s, v14.4s\n"
    "ldr s6, [%[wbptr], #36]\n"
    "add x23, x23, #4\n"
    "fmla v3.4s, v18.4s, v8.4s\n"
    "fmla v1.4s, v18.4s, v11.4s\n"
    "mov v0.16b, v15.16b\n"
    "bne 5b\n"
    // 6: scalar tail (counterpart of label 3): finish the last channel.
    "6:\n"
    "fmla v3.4s, v21.4s, v10.4s\n"
    "ldr s19, [x26, x21]\n"
    "fmla v1.4s, v21.4s, v13.4s\n"
    "ldr s23, [%[inptr0], x22]\n"
    "fmla v2.4s, v21.4s, v11.4s\n"
    "ldr s22, [x28, %[input_col_stride1]]\n"
    "fmla v0.4s, v21.4s, v14.4s\n"
    "ldr s21, [x27, x21]\n"
    "fmla v3.4s, v17.4s, v12.4s\n"
    "ldr s18, [x26, x22]\n"
    "fmla v2.4s, v17.4s, v13.4s\n"
    "ldr s16, [x28, x21]\n"
    "fmla v1.4s, v20.4s, v8.4s\n"
    "ldr s20, [x27, x22]\n"
    "fmla v3.4s, v5.4s, v7.4s\n"
    "ldr s4, [x28, x22]\n"
    "fmla v2.4s, v5.4s, v8.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v1.4s, v5.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v0.4s, v5.4s, v11.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v3.4s, v19.4s, v9.4s\n"
    "add x26, x26, #4\n"
    "fmla v1.4s, v19.4s, v12.4s\n"
    "add x27, x27, #4\n"
    "fmla v2.4s, v19.4s, v10.4s\n"
    "add x28, x28, #4\n"
    "fmla v0.4s, v19.4s, v13.4s\n"
    "fmla v3.4s, v21.4s, v6.4s\n"
    "fmla v1.4s, v22.4s, v7.4s\n"
    "fmla v2.4s, v23.4s, v12.4s\n"
    "str s3, [%[outptr0]]\n"
    "fmla v0.4s, v22.4s, v8.4s\n"
    "fmla v1.4s, v21.4s, v9.4s\n"
    "fmla v2.4s, v21.4s, v7.4s\n"
    "fmla v0.4s, v21.4s, v10.4s\n"
    "fmla v1.4s, v16.4s, v6.4s\n"
    "fmla v2.4s, v18.4s, v9.4s\n"
    "fmla v0.4s, v18.4s, v12.4s\n"
    "str s1, [x23]\n"
    "fmla v2.4s, v20.4s, v6.4s\n"
    "fmla v0.4s, v16.4s, v7.4s\n"
    "str s2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v20.4s, v9.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v0.4s, v4.4s, v6.4s\n"
    "str s0, [x23, %[output_col_stride1]]\n"
    "add x23, x23, #4\n"
    // 7: exit.
    "7:\n"
    // Read-write operands: the three pointers the assembly advances.
    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
    // Read-only operands: strides converted from elements to bytes, and the
    // channel count widened to a 64-bit register value.
    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [output_col_stride1] "r" (output_col_stride * sizeof(float))
    // Clobbers: condition flags, vector registers v0-v24, scratch registers
    // x21-x28, and memory (the output tile is written through raw pointers).
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

/**
 * Compute one 2x2 output tile from a 4x4 input patch with 3x3 weights, for
 * every channel, clamping each result to be non-negative (fmax against a zero
 * vector) before it is stored.
 *
 * The work is done in a single AArch64 inline-assembly block. Channels are
 * processed four at a time with 128-bit (q) loads/stores; the remaining
 * (n_channels & 3) channels are processed one at a time with 32-bit (s)
 * loads/stores.
 *
 * @param n_channels        Number of channels to process.
 * @param weight_bias_ptr   Packed parameters. Each group of four channels reads
 *                          ten 16-byte vectors (160 bytes): the first initialises
 *                          the four accumulators (the bias), the next nine are the
 *                          3x3 weights in row-major order. Each remaining single
 *                          channel reads ten consecutive floats (40 bytes) in the
 *                          same order.
 * @param input             Pointer to the top-left element of the 4x4 input
 *                          patch; consecutive channels are consecutive floats.
 * @param input_row_stride  Distance between input rows, in elements.
 * @param input_col_stride  Distance between input columns, in elements.
 * @param output            Pointer to the top-left element of the 2x2 output
 *                          tile; consecutive channels are consecutive floats.
 * @param output_row_stride Distance between output rows, in elements.
 * @param output_col_stride Distance between output columns, in elements.
 *
 * The pointer parameters are local copies that the assembly advances as it
 * walks the channels; the caller's pointers are not modified.
 */
template <>
template <>
void Conv::execute_tile<ActivationFunction::ReLU>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    // Address setup (all strides reach the assembly already scaled to bytes):
    //   x21, x23, x28 = pointers to input rows 1, 2, 3 (row 0 is inptr0)
    //   x24, x27      = byte offsets of input columns 2 and 3
    //   x22           = pointer to output row 1 (row 0 is outptr0)
    //   x26           = n_channels / 4 (vector iterations)
    //   x25           = n_channels % 4 (scalar iterations)
    "add x21, %[inptr0], %[input_row_stride]\n"
    "add x24, %[input_col_stride1], %[input_col_stride1]\n"
    "add x22, %[outptr0], %[output_row_stride]\n"
    "add x23, x21, %[input_row_stride]\n"
    "add x27, x24, %[input_col_stride1]\n"
    "and x25, %[n_channels], #3\n"
    "add x28, x23, %[input_row_stride]\n"
    "lsr x26, %[n_channels], #2\n"
    // No complete group of four channels: go straight to the scalar path.
    "cbz x26, 4f\n"
    // 1: prologue of the four-channel path. v11 holds the bias and seeds the
    // accumulators v17 (out[0][0]), v16 (out[0][1]), v15 (out[1][0]) and
    // v14 (out[1][1]). The nine weights, in row-major order, live in
    // v13, v4, v2, v5, v10, v1, v12, v0, v3.
    // The flags set by this subs are consumed by the "beq 3f" below; none of
    // the instructions in between modify them.
    "1:\n"
    "ldr q11, [%[wbptr]]\n"
    "subs x26, x26, #1\n"
    "mov v17.16b, v11.16b\n"
    "ldr q13, [%[wbptr], #16]\n"
    "mov v15.16b, v11.16b\n"
    "ldr q4, [%[wbptr], #32]\n"
    "mov v16.16b, v11.16b\n"
    "ldr q2, [%[wbptr], #48]\n"
    "mov v14.16b, v11.16b\n"
    "ldr q5, [%[wbptr], #64]\n"
    "ldr q10, [%[wbptr], #80]\n"
    "ldr q1, [%[wbptr], #96]\n"
    "ldr q12, [%[wbptr], #112]\n"
    "ldr q0, [%[wbptr], #128]\n"
    "ldr q3, [%[wbptr], #144]\n"
    // Load the first eight input elements and start accumulating.
    "ldr q6, [%[inptr0]]\n"
    "fmla v17.4s, v6.4s, v13.4s\n"
    "ldr q27, [x21]\n"
    "fmla v15.4s, v27.4s, v13.4s\n"
    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v16.4s, v23.4s, v13.4s\n"
    "ldr q24, [x23]\n"
    "fmla v17.4s, v27.4s, v5.4s\n"
    "ldr q22, [x21, %[input_col_stride1]]\n"
    "ldr q9, [%[inptr0], x24]\n"
    "ldr q8, [x28]\n"
    "ldr q20, [x23, %[input_col_stride1]]\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    // Only one group of four channels: skip the pipelined loop.
    "beq 3f\n"
    // 2: steady-state loop. Finishes the current group (remaining eight input
    // loads, remaining multiply-accumulates, clamp, four stores) while
    // loading the parameters and first inputs of the next group and starting
    // its accumulation. All pointers advance by 16 bytes (four floats).
    "2:\n"
    "fmla v17.4s, v24.4s, v12.4s\n"
    "ldr q26, [x21, x24]\n"
    "fmla v15.4s, v24.4s, v5.4s\n"
    "ldr q27, [%[inptr0], x27]\n"
    "fmla v16.4s, v22.4s, v5.4s\n"
    "ldr q25, [x28, %[input_col_stride1]]\n"
    "fmla v17.4s, v22.4s, v10.4s\n"
    "ldr q24, [x23, x24]\n"
    "fmla v15.4s, v22.4s, v4.4s\n"
    "ldr q21, [x21, x27]\n"
    "fmla v14.4s, v22.4s, v13.4s\n"
    "ldr q7, [x28, x24]\n"
    "fmla v17.4s, v9.4s, v2.4s\n"
    "ldr q19, [x23, x27]\n"
    "fmla v16.4s, v9.4s, v4.4s\n"
    "ldr q18, [x28, x27]\n"
    "fmla v15.4s, v8.4s, v12.4s\n"
    // Step to the next group's parameters (10 vectors * 16 bytes).
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v17.4s, v20.4s, v0.4s\n"
    "ldr q11, [%[wbptr]]\n"
    "fmla v16.4s, v20.4s, v12.4s\n"
    "ldr q13, [%[wbptr], #16]\n"
    "fmla v15.4s, v20.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v14.4s, v20.4s, v5.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v17.4s, v26.4s, v1.4s\n"
    "ldr q6, [%[inptr0]]\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "ldr q23, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "ldr q5, [%[wbptr], #64]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "ldr q9, [%[inptr0], x24]\n"
    "fmla v15.4s, v25.4s, v0.4s\n"
    "add x21, x21, #16\n"
    "fmla v16.4s, v27.4s, v2.4s\n"
    "ldr q27, [x21]\n"
    "fmla v14.4s, v25.4s, v12.4s\n"
    "ldr q4, [%[wbptr], #32]\n"
    "fmla v17.4s, v24.4s, v3.4s\n"
    "ldr q22, [x21, %[input_col_stride1]]\n"
    "fmla v15.4s, v24.4s, v1.4s\n"
    "add x23, x23, #16\n"
    "fmla v16.4s, v24.4s, v0.4s\n"
    "ldr q12, [%[wbptr], #112]\n"
    "fmla v14.4s, v24.4s, v10.4s\n"
    "ldr q24, [x23]\n"
    "fmla v15.4s, v7.4s, v3.4s\n"
    "ldr q20, [x23, %[input_col_stride1]]\n"
    "fmla v16.4s, v21.4s, v1.4s\n"
    "add x28, x28, #16\n"
    "fmla v14.4s, v21.4s, v2.4s\n"
    "ldr q10, [%[wbptr], #80]\n"
    // v26 has had its last use as an input; reuse it as the zero vector for
    // the clamp.
    "movi v26.16b, #0\n"
    "ldr q8, [x28]\n"
    "fmla v16.4s, v19.4s, v3.4s\n"
    // Loop counter update; the flags are consumed by "bne 2b" below.
    "subs x26, x26, #1\n"
    "fmla v14.4s, v7.4s, v0.4s\n"
    "ldr q2, [%[wbptr], #48]\n"
    // Clamp the three finished accumulators to >= 0, then store them.
    "fmax v17.4s, v17.4s, v26.4s\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "str q17, [%[outptr0]]\n"
    "str q16, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v14.4s, v19.4s, v1.4s\n"
    "str q15, [x22]\n"
    // Re-seed the accumulators with the next group's bias.
    "mov v17.16b, v11.16b\n"
    "mov v15.16b, v11.16b\n"
    "ldr q0, [%[wbptr], #128]\n"
    "fmla v14.4s, v18.4s, v3.4s\n"
    "ldr q1, [%[wbptr], #96]\n"
    "mov v16.16b, v11.16b\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v17.4s, v6.4s, v13.4s\n"
    "fmla v15.4s, v27.4s, v13.4s\n"
    // Clamp and store out[1][1].
    "fmax v14.4s, v14.4s, v26.4s\n"
    "ldr q3, [%[wbptr], #144]\n"
    "fmla v16.4s, v23.4s, v13.4s\n"
    "str q14, [x22, %[output_col_stride1]]\n"
    "mov v14.16b, v11.16b\n"
    "add x22, x22, #16\n"
    "fmla v17.4s, v27.4s, v5.4s\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    "bne 2b\n"
    // 3: tail of the four-channel path. Finishes the last group exactly as the
    // loop does, but loads no parameters or inputs for a following group.
    "3:\n"
    "fmla v17.4s, v24.4s, v12.4s\n"
    "ldr q26, [x21, x24]\n"
    "fmla v15.4s, v24.4s, v5.4s\n"
    "ldr q27, [%[inptr0], x27]\n"
    "fmla v16.4s, v22.4s, v5.4s\n"
    "ldr q25, [x28, %[input_col_stride1]]\n"
    "fmla v17.4s, v22.4s, v10.4s\n"
    "ldr q24, [x23, x24]\n"
    "fmla v15.4s, v22.4s, v4.4s\n"
    "ldr q21, [x21, x27]\n"
    "fmla v14.4s, v22.4s, v13.4s\n"
    "ldr q7, [x28, x24]\n"
    "fmla v17.4s, v9.4s, v2.4s\n"
    "ldr q19, [x23, x27]\n"
    "fmla v16.4s, v9.4s, v4.4s\n"
    "ldr q18, [x28, x27]\n"
    "fmla v15.4s, v8.4s, v12.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v17.4s, v20.4s, v0.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v20.4s, v12.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v15.4s, v20.4s, v10.4s\n"
    "add x21, x21, #16\n"
    "fmla v14.4s, v20.4s, v5.4s\n"
    "add x23, x23, #16\n"
    "fmla v17.4s, v26.4s, v1.4s\n"
    "add x28, x28, #16\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    // Last use of v26 as an input is above; it now becomes the zero vector.
    "movi v26.16b, #0\n"
    "fmla v17.4s, v24.4s, v3.4s\n"
    "fmla v16.4s, v27.4s, v2.4s\n"
    "fmla v15.4s, v25.4s, v0.4s\n"
    "fmla v14.4s, v25.4s, v12.4s\n"
    "fmax v17.4s, v17.4s, v26.4s\n"
    "fmla v16.4s, v24.4s, v0.4s\n"
    "str q17, [%[outptr0]]\n"
    "fmla v15.4s, v24.4s, v1.4s\n"
    "fmla v14.4s, v24.4s, v10.4s\n"
    "fmla v16.4s, v21.4s, v1.4s\n"
    "fmla v15.4s, v7.4s, v3.4s\n"
    "fmla v14.4s, v21.4s, v2.4s\n"
    "fmla v16.4s, v19.4s, v3.4s\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "fmla v14.4s, v7.4s, v0.4s\n"
    "str q15, [x22]\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "fmla v14.4s, v19.4s, v1.4s\n"
    "str q16, [%[outptr0], %[output_col_stride1]]\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmla v14.4s, v18.4s, v3.4s\n"
    "fmax v14.4s, v14.4s, v26.4s\n"
    "str q14, [x22, %[output_col_stride1]]\n"
    "add x22, x22, #16\n"
    // 4: scalar path for the remaining (n_channels & 3) channels. Same
    // schedule as the four-channel path, but with 32-bit loads and stores,
    // 40 bytes of parameters per channel and 4-byte pointer steps. The fmla
    // and fmax instructions still operate on full vectors; only lane 0 is
    // stored.
    "4:\n"
    "cbz x25, 7f\n"
    "ldr s11, [%[wbptr]]\n"
    "mov v17.16b, v11.16b\n"
    "ldr s13, [%[wbptr], #4]\n"
    "mov v15.16b, v11.16b\n"
    "ldr s4, [%[wbptr], #8]\n"
    "mov v16.16b, v11.16b\n"
    "ldr s2, [%[wbptr], #12]\n"
    "mov v14.16b, v11.16b\n"
    "ldr s5, [%[wbptr], #16]\n"
    "ldr s10, [%[wbptr], #20]\n"
    // Flags from this subs are consumed by "beq 6f" below.
    "subs x25, x25, #1\n"
    "ldr s1, [%[wbptr], #24]\n"
    "ldr s12, [%[wbptr], #28]\n"
    "ldr s0, [%[wbptr], #32]\n"
    "ldr s3, [%[wbptr], #36]\n"
    "ldr s6, [%[inptr0]]\n"
    "ldr s27, [x21]\n"
    "fmla v17.4s, v6.4s, v13.4s\n"
    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v15.4s, v27.4s, v13.4s\n"
    "ldr s24, [x23]\n"
    "fmla v16.4s, v23.4s, v13.4s\n"
    "ldr s22, [x21, %[input_col_stride1]]\n"
    "fmla v17.4s, v27.4s, v5.4s\n"
    "ldr s9, [%[inptr0], x24]\n"
    "ldr s8, [x28]\n"
    "ldr s20, [x23, %[input_col_stride1]]\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    "beq 6f\n"
    // 5: scalar steady-state loop (counterpart of label 2).
    "5:\n"
    "fmla v17.4s, v24.4s, v12.4s\n"
    "ldr s26, [x21, x24]\n"
    "fmla v15.4s, v24.4s, v5.4s\n"
    "ldr s27, [%[inptr0], x27]\n"
    "fmla v16.4s, v22.4s, v5.4s\n"
    "ldr s25, [x28, %[input_col_stride1]]\n"
    "fmla v17.4s, v22.4s, v10.4s\n"
    "ldr s24, [x23, x24]\n"
    "fmla v15.4s, v22.4s, v4.4s\n"
    "ldr s21, [x21, x27]\n"
    "fmla v14.4s, v22.4s, v13.4s\n"
    "ldr s7, [x28, x24]\n"
    "fmla v17.4s, v9.4s, v2.4s\n"
    "ldr s19, [x23, x27]\n"
    "fmla v16.4s, v9.4s, v4.4s\n"
    "ldr s18, [x28, x27]\n"
    "fmla v15.4s, v8.4s, v12.4s\n"
    // Step to the next channel's parameters (10 floats * 4 bytes).
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v17.4s, v20.4s, v0.4s\n"
    "ldr s11, [%[wbptr]]\n"
    "fmla v16.4s, v20.4s, v12.4s\n"
    "ldr s13, [%[wbptr], #4]\n"
    "fmla v15.4s, v20.4s, v10.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v14.4s, v20.4s, v5.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v17.4s, v26.4s, v1.4s\n"
    "ldr s6, [%[inptr0]]\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "ldr s23, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "ldr s5, [%[wbptr], #16]\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "ldr s9, [%[inptr0], x24]\n"
    "fmla v15.4s, v25.4s, v0.4s\n"
    "add x21, x21, #4\n"
    "fmla v16.4s, v27.4s, v2.4s\n"
    "ldr s27, [x21]\n"
    "fmla v14.4s, v25.4s, v12.4s\n"
    "ldr s4, [%[wbptr], #8]\n"
    "fmla v17.4s, v24.4s, v3.4s\n"
    "ldr s22, [x21, %[input_col_stride1]]\n"
    "fmla v15.4s, v24.4s, v1.4s\n"
    "add x23, x23, #4\n"
    "fmla v16.4s, v24.4s, v0.4s\n"
    "ldr s12, [%[wbptr], #28]\n"
    "fmla v14.4s, v24.4s, v10.4s\n"
    "ldr s24, [x23]\n"
    "fmla v15.4s, v7.4s, v3.4s\n"
    "ldr s20, [x23, %[input_col_stride1]]\n"
    "fmla v16.4s, v21.4s, v1.4s\n"
    "add x28, x28, #4\n"
    "fmla v14.4s, v21.4s, v2.4s\n"
    "ldr s10, [%[wbptr], #20]\n"
    // v26 becomes the zero vector for the clamp.
    "movi v26.16b, #0\n"
    "ldr s8, [x28]\n"
    "fmla v16.4s, v19.4s, v3.4s\n"
    "subs x25, x25, #1\n"
    "fmla v14.4s, v7.4s, v0.4s\n"
    "ldr s2, [%[wbptr], #12]\n"
    "fmax v17.4s, v17.4s, v26.4s\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "str s17, [%[outptr0]]\n"
    "str s16, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v14.4s, v19.4s, v1.4s\n"
    "str s15, [x22]\n"
    "mov v17.16b, v11.16b\n"
    "mov v15.16b, v11.16b\n"
    "ldr s0, [%[wbptr], #32]\n"
    "fmla v14.4s, v18.4s, v3.4s\n"
    "ldr s1, [%[wbptr], #24]\n"
    "mov v16.16b, v11.16b\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v17.4s, v6.4s, v13.4s\n"
    "fmla v15.4s, v27.4s, v13.4s\n"
    "fmax v14.4s, v14.4s, v26.4s\n"
    "ldr s3, [%[wbptr], #36]\n"
    "fmla v16.4s, v23.4s, v13.4s\n"
    "str s14, [x22, %[output_col_stride1]]\n"
    "mov v14.16b, v11.16b\n"
    "add x22, x22, #4\n"
    "fmla v17.4s, v27.4s, v5.4s\n"
    "fmla v17.4s, v23.4s, v4.4s\n"
    "bne 5b\n"
    // 6: scalar tail (counterpart of label 3): finish the last channel.
    "6:\n"
    "fmla v17.4s, v24.4s, v12.4s\n"
    "ldr s26, [x21, x24]\n"
    "fmla v15.4s, v24.4s, v5.4s\n"
    "ldr s27, [%[inptr0], x27]\n"
    "fmla v16.4s, v22.4s, v5.4s\n"
    "ldr s25, [x28, %[input_col_stride1]]\n"
    "fmla v17.4s, v22.4s, v10.4s\n"
    "ldr s24, [x23, x24]\n"
    "fmla v15.4s, v22.4s, v4.4s\n"
    "ldr s21, [x21, x27]\n"
    "fmla v14.4s, v22.4s, v13.4s\n"
    "ldr s7, [x28, x24]\n"
    "fmla v17.4s, v9.4s, v2.4s\n"
    "ldr s19, [x23, x27]\n"
    "fmla v16.4s, v9.4s, v4.4s\n"
    "ldr s18, [x28, x27]\n"
    "fmla v15.4s, v8.4s, v12.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v17.4s, v20.4s, v0.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v16.4s, v20.4s, v12.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v15.4s, v20.4s, v10.4s\n"
    "add x21, x21, #4\n"
    "fmla v14.4s, v20.4s, v5.4s\n"
    "add x23, x23, #4\n"
    "fmla v17.4s, v26.4s, v1.4s\n"
    "add x28, x28, #4\n"
    "fmla v15.4s, v26.4s, v2.4s\n"
    "fmla v16.4s, v26.4s, v10.4s\n"
    "fmla v14.4s, v26.4s, v4.4s\n"
    "movi v26.16b, #0\n"
    "fmla v17.4s, v24.4s, v3.4s\n"
    "fmla v16.4s, v27.4s, v2.4s\n"
    "fmla v15.4s, v25.4s, v0.4s\n"
    "fmla v14.4s, v25.4s, v12.4s\n"
    "fmax v17.4s, v17.4s, v26.4s\n"
    "fmla v16.4s, v24.4s, v0.4s\n"
    "str s17, [%[outptr0]]\n"
    "fmla v15.4s, v24.4s, v1.4s\n"
    "fmla v14.4s, v24.4s, v10.4s\n"
    "fmla v16.4s, v21.4s, v1.4s\n"
    "fmla v15.4s, v7.4s, v3.4s\n"
    "fmla v14.4s, v21.4s, v2.4s\n"
    "fmla v16.4s, v19.4s, v3.4s\n"
    "fmax v15.4s, v15.4s, v26.4s\n"
    "fmla v14.4s, v7.4s, v0.4s\n"
    "str s15, [x22]\n"
    "fmax v16.4s, v16.4s, v26.4s\n"
    "fmla v14.4s, v19.4s, v1.4s\n"
    "str s16, [%[outptr0], %[output_col_stride1]]\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmla v14.4s, v18.4s, v3.4s\n"
    "fmax v14.4s, v14.4s, v26.4s\n"
    "str s14, [x22, %[output_col_stride1]]\n"
    "add x22, x22, #4\n"
    // 7: exit.
    "7:\n"
    // Read-write operands: the three pointers the assembly advances.
    : [inptr0] "+r" (input), [outptr0] "+r" (output), [wbptr] "+r" (weight_bias_ptr)
    // Read-only operands: the channel count widened to a 64-bit register
    // value, and strides converted from elements to bytes.
    : [n_channels] "r" ((long) n_channels), [input_row_stride] "r" (input_row_stride * sizeof(float)), [output_row_stride] "r" (output_row_stride * sizeof(float)), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float))
    // Clobbers: condition flags, vector registers v0-v27, scratch registers
    // x21-x28, and memory (the output tile is written through raw pointers).
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

/*
 * Compute one 2x2 output tile from a 4x4 input tile for every channel, clamping
 * each result to the range [0, 6] (fmax against 0 followed by fmin against 6.0).
 *
 * For output position (r, c) the assembly evaluates
 *   out[r][c] = seed + sum over i,j in [0,3) of m[i][j] * in[r + i][c + j]
 * where `seed` and the nine multipliers m[i][j] (row-major) are read in that
 * order from weight_bias_ptr. The seed is presumably the bias, going by the
 * parameter name.
 *
 * Channels are the innermost, contiguous dimension of input and output: the
 * pointers advance by one float per channel.
 *
 * n_channels         Number of channels to process. Groups of four are handled
 *                    with 128-bit loads/stores, the remaining (n_channels & 3)
 *                    one float at a time.
 * weight_bias_ptr    Packed seed/multiplier stream, read sequentially: 160 bytes
 *                    (10 vectors of 4 floats) per group of four channels, then
 *                    40 bytes (10 floats) per leftover channel.
 * input              Pointer to the top-left element of the 4x4 input tile.
 * input_row_stride   Distance between input rows, in floats.
 * input_col_stride   Distance between input columns, in floats.
 * output             Pointer to the top-left element of the 2x2 output tile.
 * output_row_stride  Distance between output rows, in floats.
 * output_col_stride  Distance between output columns, in floats.
 *
 * Register roles inside the assembly:
 *   v3, v2, v1, v0  accumulators for out[0][0], out[0][1], out[1][0], out[1][1]
 *   v19             seed copied into every accumulator
 *   v12, v11, v10   m[0][0], m[0][1], m[0][2]
 *   v13, v23, v15   m[1][0], m[1][1], m[1][2]
 *   v20, v21, v14   m[2][0], m[2][1], m[2][2]
 *   v25, v26        0 and 6.0 clamp bounds (v25 first holds an input value)
 *   x21, x27, x28   input rows 1, 2, 3 (row 0 is %[inptr0])
 *   x23, x22        2x and 3x the input column stride, in bytes
 *   x24             output row 1 (row 0 is %[outptr0])
 *   x26, x25        remaining groups of four channels / remaining single channels
 */
template <>
template <>
void Conv::execute_tile<ActivationFunction::ReLU6>(
  int n_channels,
  const void *weight_bias_ptr,
  const float *input,
  const unsigned int input_row_stride,
  const unsigned int input_col_stride,
  float *output,
  const unsigned int output_row_stride,
  const unsigned int output_col_stride
)
{
  __asm __volatile(
    // Set-up: derive the row pointers, the 2x/3x column offsets and the two
    // loop counters. All stride operands are byte counts at this point.
    "add x21, %[inptr0], %[input_row_stride]\n"
    "add x23, %[input_col_stride1], %[input_col_stride1]\n"
    "add x24, %[outptr0], %[output_row_stride]\n"
    "add x27, x21, %[input_row_stride]\n"
    "add x22, x23, %[input_col_stride1]\n"
    "and x25, %[n_channels], #3\n"
    "add x28, x27, %[input_row_stride]\n"
    "lsr x26, %[n_channels], #2\n"
    // No complete group of four channels: go straight to the scalar path.
    "cbz x26, 4f\n"
    // Label 1: vector prologue. Load the seed and the nine multipliers for the
    // first group of four channels, seed the accumulators, load the first
    // eight input vectors and issue the first five multiply-accumulates.
    "1:\n"
    "ldr q19, [%[wbptr]]\n"
    "subs x26, x26, #1\n"
    "mov v3.16b, v19.16b\n"
    "ldr q12, [%[wbptr], #16]\n"
    "mov v1.16b, v19.16b\n"
    "ldr q11, [%[wbptr], #32]\n"
    "mov v2.16b, v19.16b\n"
    "ldr q10, [%[wbptr], #48]\n"
    "mov v0.16b, v19.16b\n"
    "ldr q13, [%[wbptr], #64]\n"
    "ldr q23, [%[wbptr], #80]\n"
    "ldr q15, [%[wbptr], #96]\n"
    "ldr q20, [%[wbptr], #112]\n"
    "ldr q21, [%[wbptr], #128]\n"
    "ldr q14, [%[wbptr], #144]\n"
    "ldr q16, [%[inptr0]]\n"
    "fmla v3.4s, v16.4s, v12.4s\n"
    "ldr q28, [x21]\n"
    "fmla v1.4s, v28.4s, v12.4s\n"
    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "ldr q24, [x27]\n"
    "fmla v3.4s, v28.4s, v13.4s\n"
    "ldr q8, [x21, %[input_col_stride1]]\n"
    "ldr q9, [%[inptr0], x23]\n"
    "ldr q18, [x28]\n"
    "ldr q6, [x27, %[input_col_stride1]]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    // Flags are those of the subs on x26 above: when this was the last group
    // of four, skip the steady-state loop and finish in the tail at label 3.
    "beq 3f\n"
    // Label 2: vector steady-state loop. Completes the current group of four
    // channels while, interleaved with it, advancing every pointer by 16
    // bytes (wbptr by 160), loading the next group's seed, multipliers and
    // first inputs, and issuing that group's first five multiply-accumulates.
    "2:\n"
    "fmla v3.4s, v24.4s, v20.4s\n"
    "ldr q25, [x21, x23]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "ldr q28, [%[inptr0], x22]\n"
    "fmla v2.4s, v8.4s, v13.4s\n"
    "ldr q24, [x28, %[input_col_stride1]]\n"
    "fmla v3.4s, v8.4s, v23.4s\n"
    "ldr q27, [x27, x23]\n"
    "fmla v1.4s, v8.4s, v11.4s\n"
    "ldr q7, [x21, x22]\n"
    "fmla v0.4s, v8.4s, v12.4s\n"
    "ldr q17, [x28, x23]\n"
    "fmla v3.4s, v9.4s, v10.4s\n"
    "ldr q5, [x27, x22]\n"
    "fmla v2.4s, v9.4s, v11.4s\n"
    "ldr q4, [x28, x22]\n"
    "fmla v1.4s, v18.4s, v20.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v3.4s, v6.4s, v21.4s\n"
    "ldr q19, [%[wbptr]]\n"
    "fmla v2.4s, v6.4s, v20.4s\n"
    "ldr q12, [%[wbptr], #16]\n"
    "fmla v1.4s, v6.4s, v23.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v0.4s, v6.4s, v13.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v3.4s, v25.4s, v15.4s\n"
    "ldr q16, [%[inptr0]]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "ldr q22, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v25.4s, v23.4s\n"
    "ldr q13, [%[wbptr], #64]\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    "ldr q9, [%[inptr0], x23]\n"
    "fmla v1.4s, v24.4s, v21.4s\n"
    "add x21, x21, #16\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "ldr q28, [x21]\n"
    "fmla v0.4s, v24.4s, v20.4s\n"
    "ldr q11, [%[wbptr], #32]\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "ldr q8, [x21, %[input_col_stride1]]\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "add x27, x27, #16\n"
    "fmla v2.4s, v27.4s, v21.4s\n"
    "ldr q20, [%[wbptr], #112]\n"
    "fmla v0.4s, v27.4s, v23.4s\n"
    "ldr q24, [x27]\n"
    "fmla v1.4s, v17.4s, v14.4s\n"
    "ldr q6, [x27, %[input_col_stride1]]\n"
    "fmla v2.4s, v7.4s, v15.4s\n"
    "add x28, x28, #16\n"
    "fmla v0.4s, v7.4s, v10.4s\n"
    "ldr q23, [%[wbptr], #80]\n"
    // v25 is no longer needed as an input here, so it becomes the zero bound.
    "movi v25.16b, #0\n"
    "ldr q18, [x28]\n"
    "fmla v2.4s, v5.4s, v14.4s\n"
    // Loop counter update; its flags are consumed by the bne at the bottom.
    "subs x26, x26, #1\n"
    "fmla v0.4s, v17.4s, v21.4s\n"
    "ldr q10, [%[wbptr], #48]\n"
    // Upper clamp bound.
    "fmov v26.4s, #6.0\n"
    "fmax v3.4s, v3.4s, v25.4s\n"
    "fmax v2.4s, v2.4s, v25.4s\n"
    "fmax v1.4s, v1.4s, v25.4s\n"
    "fmla v0.4s, v5.4s, v15.4s\n"
    "ldr q21, [%[wbptr], #128]\n"
    "fmin v3.4s, v3.4s, v26.4s\n"
    "fmin v2.4s, v2.4s, v26.4s\n"
    "fmin v1.4s, v1.4s, v26.4s\n"
    "str q3, [%[outptr0]]\n"
    "str q2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v4.4s, v14.4s\n"
    "str q1, [x24]\n"
    // Re-seed the accumulators for the next group as soon as each is stored.
    "mov v3.16b, v19.16b\n"
    "mov v1.16b, v19.16b\n"
    "ldr q15, [%[wbptr], #96]\n"
    "fmax v0.4s, v0.4s, v25.4s\n"
    "ldr q14, [%[wbptr], #144]\n"
    "mov v2.16b, v19.16b\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmin v0.4s, v0.4s, v26.4s\n"
    "fmla v3.4s, v16.4s, v12.4s\n"
    "fmla v1.4s, v28.4s, v12.4s\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "str q0, [x24, %[output_col_stride1]]\n"
    "mov v0.16b, v19.16b\n"
    "fmla v3.4s, v28.4s, v13.4s\n"
    "add x24, x24, #16\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "bne 2b\n"
    // Label 3: vector tail. Completes the last group of four channels; it
    // still advances all pointers so the scalar path can continue from them,
    // but loads nothing for a following group.
    "3:\n"
    "fmla v3.4s, v24.4s, v20.4s\n"
    "ldr q25, [x21, x23]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "ldr q28, [%[inptr0], x22]\n"
    "fmla v2.4s, v8.4s, v13.4s\n"
    "ldr q24, [x28, %[input_col_stride1]]\n"
    "fmla v3.4s, v8.4s, v23.4s\n"
    "ldr q27, [x27, x23]\n"
    "fmla v1.4s, v8.4s, v11.4s\n"
    "ldr q7, [x21, x22]\n"
    "fmla v0.4s, v8.4s, v12.4s\n"
    "ldr q17, [x28, x23]\n"
    "fmla v3.4s, v9.4s, v10.4s\n"
    "ldr q5, [x27, x22]\n"
    "fmla v2.4s, v9.4s, v11.4s\n"
    "ldr q4, [x28, x22]\n"
    "fmla v1.4s, v18.4s, v20.4s\n"
    "add %[wbptr], %[wbptr], #160\n"
    "fmla v3.4s, v6.4s, v21.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v2.4s, v6.4s, v20.4s\n"
    "add %[inptr0], %[inptr0], #16\n"
    "fmla v1.4s, v6.4s, v23.4s\n"
    "add x21, x21, #16\n"
    "fmla v0.4s, v6.4s, v13.4s\n"
    "add x27, x27, #16\n"
    "fmla v3.4s, v25.4s, v15.4s\n"
    "add x28, x28, #16\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "fmla v2.4s, v25.4s, v23.4s\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    // Last use of v25 as an input was just above; reuse it as the zero bound.
    "movi v25.16b, #0\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "fmov v26.4s, #6.0\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "fmla v1.4s, v24.4s, v21.4s\n"
    "fmla v0.4s, v24.4s, v20.4s\n"
    "fmax v3.4s, v3.4s, v25.4s\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "fmla v2.4s, v27.4s, v21.4s\n"
    "fmla v0.4s, v27.4s, v23.4s\n"
    "fmin v3.4s, v3.4s, v26.4s\n"
    "str q3, [%[outptr0]]\n"
    "fmla v2.4s, v7.4s, v15.4s\n"
    "fmla v0.4s, v7.4s, v10.4s\n"
    "fmla v1.4s, v17.4s, v14.4s\n"
    "fmla v2.4s, v5.4s, v14.4s\n"
    "fmla v0.4s, v17.4s, v21.4s\n"
    "fmax v1.4s, v1.4s, v25.4s\n"
    "fmax v2.4s, v2.4s, v25.4s\n"
    "fmla v0.4s, v5.4s, v15.4s\n"
    "fmin v1.4s, v1.4s, v26.4s\n"
    "fmin v2.4s, v2.4s, v26.4s\n"
    "str q1, [x24]\n"
    "str q2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v4.4s, v14.4s\n"
    "add %[outptr0], %[outptr0], #16\n"
    "fmax v0.4s, v0.4s, v25.4s\n"
    "fmin v0.4s, v0.4s, v26.4s\n"
    "str q0, [x24, %[output_col_stride1]]\n"
    "add x24, x24, #16\n"
    // Label 4: scalar path for the remaining (n_channels & 3) channels. Same
    // instruction schedule as the vector path, but with 32-bit loads/stores,
    // 4-byte pointer steps and 40 bytes of wbptr per channel. Only the lowest
    // lane of each accumulator is written back (str s...).
    "4:\n"
    "cbz x25, 7f\n"
    "ldr s19, [%[wbptr]]\n"
    "mov v3.16b, v19.16b\n"
    "ldr s12, [%[wbptr], #4]\n"
    "mov v1.16b, v19.16b\n"
    "ldr s11, [%[wbptr], #8]\n"
    "mov v2.16b, v19.16b\n"
    "ldr s10, [%[wbptr], #12]\n"
    "mov v0.16b, v19.16b\n"
    "ldr s13, [%[wbptr], #16]\n"
    "ldr s23, [%[wbptr], #20]\n"
    "subs x25, x25, #1\n"
    "ldr s15, [%[wbptr], #24]\n"
    "ldr s20, [%[wbptr], #28]\n"
    "ldr s21, [%[wbptr], #32]\n"
    "ldr s14, [%[wbptr], #36]\n"
    "ldr s16, [%[inptr0]]\n"
    "ldr s28, [x21]\n"
    "fmla v3.4s, v16.4s, v12.4s\n"
    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v1.4s, v28.4s, v12.4s\n"
    "ldr s24, [x27]\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "ldr s8, [x21, %[input_col_stride1]]\n"
    "fmla v3.4s, v28.4s, v13.4s\n"
    "ldr s9, [%[inptr0], x23]\n"
    "ldr s18, [x28]\n"
    "ldr s6, [x27, %[input_col_stride1]]\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    // Flags are those of the subs on x25 above: a single leftover channel
    // goes directly to the scalar tail at label 6.
    "beq 6f\n"
    // Label 5: scalar steady-state loop (one channel per iteration), the
    // 32-bit counterpart of label 2.
    "5:\n"
    "fmla v3.4s, v24.4s, v20.4s\n"
    "ldr s25, [x21, x23]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "ldr s28, [%[inptr0], x22]\n"
    "fmla v2.4s, v8.4s, v13.4s\n"
    "ldr s24, [x28, %[input_col_stride1]]\n"
    "fmla v3.4s, v8.4s, v23.4s\n"
    "ldr s27, [x27, x23]\n"
    "fmla v1.4s, v8.4s, v11.4s\n"
    "ldr s7, [x21, x22]\n"
    "fmla v0.4s, v8.4s, v12.4s\n"
    "ldr s17, [x28, x23]\n"
    "fmla v3.4s, v9.4s, v10.4s\n"
    "ldr s5, [x27, x22]\n"
    "fmla v2.4s, v9.4s, v11.4s\n"
    "ldr s4, [x28, x22]\n"
    "fmla v1.4s, v18.4s, v20.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v3.4s, v6.4s, v21.4s\n"
    "ldr s19, [%[wbptr]]\n"
    "fmla v2.4s, v6.4s, v20.4s\n"
    "ldr s12, [%[wbptr], #4]\n"
    "fmla v1.4s, v6.4s, v23.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v0.4s, v6.4s, v13.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v3.4s, v25.4s, v15.4s\n"
    "ldr s16, [%[inptr0]]\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "ldr s22, [%[inptr0], %[input_col_stride1]]\n"
    "fmla v2.4s, v25.4s, v23.4s\n"
    "ldr s13, [%[wbptr], #16]\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    "ldr s9, [%[inptr0], x23]\n"
    "fmla v1.4s, v24.4s, v21.4s\n"
    "add x21, x21, #4\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "ldr s28, [x21]\n"
    "fmla v0.4s, v24.4s, v20.4s\n"
    "ldr s11, [%[wbptr], #8]\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "ldr s8, [x21, %[input_col_stride1]]\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "add x27, x27, #4\n"
    "fmla v2.4s, v27.4s, v21.4s\n"
    "ldr s20, [%[wbptr], #28]\n"
    "fmla v0.4s, v27.4s, v23.4s\n"
    "ldr s24, [x27]\n"
    "fmla v1.4s, v17.4s, v14.4s\n"
    "ldr s6, [x27, %[input_col_stride1]]\n"
    "fmla v2.4s, v7.4s, v15.4s\n"
    "add x28, x28, #4\n"
    "fmla v0.4s, v7.4s, v10.4s\n"
    "ldr s23, [%[wbptr], #20]\n"
    "movi v25.16b, #0\n"
    "ldr s18, [x28]\n"
    "fmla v2.4s, v5.4s, v14.4s\n"
    // Loop counter update; its flags are consumed by the bne at the bottom.
    "subs x25, x25, #1\n"
    "fmla v0.4s, v17.4s, v21.4s\n"
    "ldr s10, [%[wbptr], #12]\n"
    "fmov v26.4s, #6.0\n"
    "fmax v3.4s, v3.4s, v25.4s\n"
    "fmax v2.4s, v2.4s, v25.4s\n"
    "fmax v1.4s, v1.4s, v25.4s\n"
    "fmla v0.4s, v5.4s, v15.4s\n"
    "ldr s21, [%[wbptr], #32]\n"
    "fmin v3.4s, v3.4s, v26.4s\n"
    "fmin v2.4s, v2.4s, v26.4s\n"
    "fmin v1.4s, v1.4s, v26.4s\n"
    "str s3, [%[outptr0]]\n"
    "str s2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v4.4s, v14.4s\n"
    "str s1, [x24]\n"
    "mov v3.16b, v19.16b\n"
    "mov v1.16b, v19.16b\n"
    "ldr s15, [%[wbptr], #24]\n"
    "fmax v0.4s, v0.4s, v25.4s\n"
    "ldr s14, [%[wbptr], #36]\n"
    "mov v2.16b, v19.16b\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmin v0.4s, v0.4s, v26.4s\n"
    "fmla v3.4s, v16.4s, v12.4s\n"
    "fmla v1.4s, v28.4s, v12.4s\n"
    "fmla v2.4s, v22.4s, v12.4s\n"
    "str s0, [x24, %[output_col_stride1]]\n"
    "mov v0.16b, v19.16b\n"
    "fmla v3.4s, v28.4s, v13.4s\n"
    "add x24, x24, #4\n"
    "fmla v3.4s, v22.4s, v11.4s\n"
    "bne 5b\n"
    // Label 6: scalar tail, the 32-bit counterpart of label 3; completes the
    // final channel.
    "6:\n"
    "fmla v3.4s, v24.4s, v20.4s\n"
    "ldr s25, [x21, x23]\n"
    "fmla v1.4s, v24.4s, v13.4s\n"
    "ldr s28, [%[inptr0], x22]\n"
    "fmla v2.4s, v8.4s, v13.4s\n"
    "ldr s24, [x28, %[input_col_stride1]]\n"
    "fmla v3.4s, v8.4s, v23.4s\n"
    "ldr s27, [x27, x23]\n"
    "fmla v1.4s, v8.4s, v11.4s\n"
    "ldr s7, [x21, x22]\n"
    "fmla v0.4s, v8.4s, v12.4s\n"
    "ldr s17, [x28, x23]\n"
    "fmla v3.4s, v9.4s, v10.4s\n"
    "ldr s5, [x27, x22]\n"
    "fmla v2.4s, v9.4s, v11.4s\n"
    "ldr s4, [x28, x22]\n"
    "fmla v1.4s, v18.4s, v20.4s\n"
    "add %[wbptr], %[wbptr], #40\n"
    "fmla v3.4s, v6.4s, v21.4s\n"
    "prfm pldl1keep, [%[wbptr], #64]\n"
    "fmla v2.4s, v6.4s, v20.4s\n"
    "add %[inptr0], %[inptr0], #4\n"
    "fmla v1.4s, v6.4s, v23.4s\n"
    "add x21, x21, #4\n"
    "fmla v0.4s, v6.4s, v13.4s\n"
    "add x27, x27, #4\n"
    "fmla v3.4s, v25.4s, v15.4s\n"
    "add x28, x28, #4\n"
    "fmla v1.4s, v25.4s, v10.4s\n"
    "fmla v2.4s, v25.4s, v23.4s\n"
    "fmla v0.4s, v25.4s, v11.4s\n"
    "movi v25.16b, #0\n"
    "fmla v3.4s, v27.4s, v14.4s\n"
    "fmov v26.4s, #6.0\n"
    "fmla v2.4s, v28.4s, v10.4s\n"
    "fmla v1.4s, v24.4s, v21.4s\n"
    "fmla v0.4s, v24.4s, v20.4s\n"
    "fmax v3.4s, v3.4s, v25.4s\n"
    "fmla v1.4s, v27.4s, v15.4s\n"
    "fmla v2.4s, v27.4s, v21.4s\n"
    "fmla v0.4s, v27.4s, v23.4s\n"
    "fmin v3.4s, v3.4s, v26.4s\n"
    "str s3, [%[outptr0]]\n"
    "fmla v2.4s, v7.4s, v15.4s\n"
    "fmla v0.4s, v7.4s, v10.4s\n"
    "fmla v1.4s, v17.4s, v14.4s\n"
    "fmla v2.4s, v5.4s, v14.4s\n"
    "fmla v0.4s, v17.4s, v21.4s\n"
    "fmax v1.4s, v1.4s, v25.4s\n"
    "fmax v2.4s, v2.4s, v25.4s\n"
    "fmla v0.4s, v5.4s, v15.4s\n"
    "fmin v1.4s, v1.4s, v26.4s\n"
    "fmin v2.4s, v2.4s, v26.4s\n"
    "str s1, [x24]\n"
    "str s2, [%[outptr0], %[output_col_stride1]]\n"
    "fmla v0.4s, v4.4s, v14.4s\n"
    "add %[outptr0], %[outptr0], #4\n"
    "fmax v0.4s, v0.4s, v25.4s\n"
    "fmin v0.4s, v0.4s, v26.4s\n"
    "str s0, [x24, %[output_col_stride1]]\n"
    "add x24, x24, #4\n"
    // Label 7: common exit.
    "7:\n"
    // Read-write operands: the local pointer copies are advanced in place by
    // the assembly above.
    : [outptr0] "+r" (output), [inptr0] "+r" (input), [wbptr] "+r" (weight_bias_ptr)
    // Read-only operands: every stride is scaled by sizeof(float) so the
    // assembly can use it directly as a byte offset.
    : [output_row_stride] "r" (output_row_stride * sizeof(float)), [n_channels] "r" ((long) n_channels), [input_col_stride1] "r" (input_col_stride * sizeof(float)), [output_col_stride1] "r" (output_col_stride * sizeof(float)), [input_row_stride] "r" (input_row_stride * sizeof(float))
    // Clobbers: condition flags (subs), every vector and general-purpose
    // scratch register named above, and memory (the output tile is written).
    : "cc", "v0", "v1", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v2", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "memory"
  );
}

#endif  // __aarch64__

// Explicit instantiation of the class template for the same parameter set that
// the `Conv` alias names at the top of this file. It sits after the #endif, so
// it is emitted whether or not __aarch64__ is defined.
template class DepthwiseConvolution<2, 2, 3, 3, 1, 1, float, float, float>;

}  // namespace depthwise
